import os
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import tensorflow as tf
import sys
import smogn
Data is loaded from the CSV files and read into DataFrames.
# Reproducibility: fix the TensorFlow RNG seed so model runs are repeatable.
seed = 1
tf.random.set_seed(seed)
# Quieten deprecation noise and show full arrays/frames when printed.
warnings.filterwarnings("ignore", category=DeprecationWarning)
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_rows', 1000)
# Load the train/validation CSVs from the working directory.
# os.path.join builds platform-correct paths (the original '//'
# concatenation only worked because path normalisation collapses it).
wd = os.getcwd()
train_file = os.path.join(wd, 'train.csv')
valid_file = os.path.join(wd, 'test.csv')
train_df = pd.read_csv(train_file)
valid_df = pd.read_csv(valid_file)
Firstly, the counts, mean, standard deviation, min, quartile and max for numerical columns. From the counts of the training data it is observed that there is a maximum of 3700 rows where some rows have fewer, most likely from NAs which is to be confirmed below. Similarly, the validation dataset has a maximum of 200 rows with a few variables having less than that.
# Summary statistics (count/mean/std/min/quartiles/max) for the numeric
# columns of each dataset; in a notebook only the last expression is shown.
train_df.describe()
valid_df.describe()
Next, the number of blank rows (NAs) is tabularized by variable.
# Tabulate the number of missing values (NAs) per training-set column.
na_list = []
col_list = train_df.columns
for col in col_list:
    na_list.append(train_df[col].isna().sum())
na_df = pd.DataFrame({'Name': col_list, 'Count': na_list})
# Bug fix: the counts come from train_df, so the percentage must be taken
# over the TRAINING set size, not len(valid_df); scale to an actual percent.
na_df['Percentage'] = na_df['Count'] / len(train_df) * 100
# Bug fix: sort while the column is still numeric — the original formatted
# to strings first, which sorts lexicographically ('9.0' > '10.0').
na_df = na_df.sort_values('Percentage', ascending=False)
na_df['Percentage'] = na_df['Percentage'].apply(lambda x: '%.1f' % x)
na_df
It is observed that variable v16 has the highest number of NAs by a large margin and could be dropped from training if it is observed to contribute significantly to the inaccuracy of the model.
# Split columns into numerical vs categorical: describe() only reports
# numeric columns, so anything absent from it is treated as categorical.
num_col = list(train_df.describe().keys())
cat_col = [name for name in col_list if name not in set(num_col)]
cat_col_bkp = cat_col

# Grid of violin plots: one row per dataset (train on top, validation below),
# one column per variable.
row_titles = ['Train', 'Valid']
no_row = len(row_titles)
no_num_col = len(col_list)
fig_subplot = make_subplots(rows=no_row, cols=no_num_col, row_titles=row_titles)
df_list = [train_df, valid_df]
# Train vs Validation rows: draw one violin per column so the per-variable
# distributions of the two datasets can be compared side by side.
for row_idx, frame in enumerate(df_list):
    # Iterating through the variables of this dataset
    for col_idx, col_name in enumerate(frame.columns):
        # add_trace replaces the deprecated append_trace API.
        fig_subplot.add_trace(
            go.Violin(
                y=frame[col_name].values,
                name=str(col_name),
                box_visible=False,
                meanline_visible=True,
                fillcolor='lightyellow',
                line_color='grey',
            ),
            row=row_idx + 1,
            col=col_idx + 1,
        )
fig_subplot.layout.update(showlegend=False,title_text = 'Figure 2.2: Violin Plot Data Distribution of Variables',width=12800,height=720)
fig_subplot.show()
## Conduct SMOGN: synthetic over-sampling for regression, used here to
## counter target imbalance in 'SalePrice' before training.
smogn_params = dict(
    data=train_df,           # pandas dataframe
    y='SalePrice',           # target column header
    k=9,                     # neighbour count, positive integer (k < n)
    samp_method='extreme',   # 'balance' or 'extreme'
    ## phi relevance arguments
    rel_thres=0.80,          # relevance threshold (0 < R < 1)
    rel_method='auto',       # 'auto' or 'manual'
    rel_xtrm_type='high',    # 'low', 'both' or 'high'
    rel_coef=2.25,           # box-plot coefficient (0 < R)
)
train_smogn = smogn.smoter(**smogn_params)
train_smogn = train_smogn.reset_index(drop=True)
# Build a combined train+validation frame so imputation/scaling see a single
# feature space; a 'Group' flag records each row's origin for resplitting.
train_group_list = [0] * len(train_smogn)  # Group 0 = train
valid_group_list = [1] * len(valid_df)     # Group 1 = validation
train_group_df = pd.DataFrame({'Group': train_group_list})
valid_group_df = pd.DataFrame({'Group': valid_group_list})
train_df = pd.concat([train_smogn, train_group_df], axis=1)
valid_df = pd.concat([valid_df, valid_group_df], axis=1)
# Stack train on top of validation and renumber rows from zero.
comb_df = pd.concat([train_df, valid_df], axis=0)
comb_df = comb_df.reset_index(drop=True)
# 'Group' is numeric, so track it with the numerical columns.
if 'Group' not in num_col:
    num_col.append('Group')
comb_cat_df = comb_df[cat_col]
comb_num_df = comb_df[num_col]
Figure 2.2 shows violin plots illustrating the data distribution by variable. The shape of the violin plots makes the categorical and numerical variables differentiable visually. Categorical variables have the datapoints split to the ends of the violin plot, whereas numerical variables are more normally distributed. Based on the distribution, v17 has been falsely assumed to be a numerical variable. It is actually a binary classified categorical variable but does not need to be normalized.
There are certain variables that have different data distributions for the training and validation datasets, for example v8, v12 and v17. This might affect the accuracy of prediction for the validation set.
It is also observed that the training dataset has classLabel imbalance with many more 'yes' than 'no' examples, whereas the validation dataset has a more equal distribution of 'yes' and 'no' examples, which might also affect the accuracy of the model. This could potentially be offset by undersampling of the examples with 'yes' classLabels or oversampling of the examples with 'no' classLabels during training.
Next, the two parts of the dataset are combined for further visualization.
# Pairwise conditional distributions of the categorical variables, drawn as
# a grid of heatmaps (row variable conditioned on, column variable counted).
cat_col = ['BldgType','HouseStyle','ExterQual','Foundation','BsmtCond','CentralAir','KitchenQual','GarageType','GarageQual','GarageCond','PoolQC','Fence','SaleType']
list_len = len(cat_col)
combo_list = []
fig_cm = make_subplots(rows=list_len,cols=list_len,row_titles=cat_col,column_titles=cat_col)
#Iterating through the variables (grid rows)
for i,j in enumerate(cat_col):
    #Iterating through the variables (grid columns)
    for k,l in enumerate(cat_col):
        # Bug fix: GroupBy.aggregate(<column name>) is not a valid way to
        # select a column; index the grouped column, then take its
        # normalized value counts per group.
        values = comb_cat_df.groupby(j)[l].value_counts(normalize=True).unstack().values
        values = np.round(values, 2)
        # pd.isna catches NaN labels even when they are not the np.nan
        # singleton — `in`/`remove` rely on identity or ==, and NaN != NaN.
        x_keys = [key for key in comb_cat_df[j].unique() if not pd.isna(key)]
        y_keys = [key for key in comb_cat_df[l].unique() if not pd.isna(key)]
        # add_trace replaces the deprecated append_trace API.
        fig_cm.add_trace(go.Heatmap(
            x = x_keys,
            y = y_keys,
            z = values,
            colorscale = 'Oryel',
            text = values,
            texttemplate = '%{text}',
            hoverongaps = False,
            colorbar = dict(tick0=0.0,dtick=0.1),
            zmin=0.0,
            zmax=1.0
        ),row=i+1,col=k+1)
fig_cm.update_xaxes(tickson='boundaries')
fig_cm.update_yaxes(tickson='boundaries')
fig_cm.layout.update(width=4320,height=1620,title_text='Figure 2.3: Data Distribution Heatmap for Categorical Variables')
fig_cm.show()
Figure 2.3 illustrates the data distribution heatmap for categorical variables. The purpose of plotting the data distribution heatmap for categorical variables is to visually discover data distribution skews in variables with respect to other variables and the classLabel.
For example, in the second row of the first column, variable v4 is more likely to be skewed in favor of 'y' when v1 is 'b'. On the other hand, it is equally distributed when v1 is 'a', while 'l' is rarely seen. Note that the diagonal consists of identity matrices (all 0s with 1s along the main diagonal), since each variable is compared with itself there.
It is observed that all variables are skewed with respect to classLabel but are balanced with respect to each other with the exception of v12, which is skewed even when compared to every other variable.
# Get the correlation matrix of the numerical columns.
# numeric_only=True makes pandas >= 2.0 skip the categorical columns
# explicitly (older pandas dropped them silently).
corr_matrix = comb_df.corr(numeric_only=True).round(2)
corr_col_list = corr_matrix.columns
# Bug fix: the original formatted every correlation into a string, which
# breaks px.imshow's numeric colour mapping; keep the matrix numeric and
# let text_auto render the rounded values.
fig_corr = px.imshow(corr_matrix,text_auto=True,color_continuous_scale='oryel')
fig_corr.layout.update(width=2160,height=1620,title_text='Figure 2.4: Correlation Heatmap for Numerical Variables',xaxis_title='Variables',yaxis_title='Variables')
fig_corr.show()
Figure 2.4 illustrates the correlation heatmap for numerical variables. Variables v13 and v15 are almost fully correlated, so one of them can be dropped if either is observed to contribute significantly to the inaccuracy of the model. As expected, the diagonal consists of 1s because each variable is fully correlated with itself.
The categorical variables are imputed with the mode and converted to numerical values using the OrdinalEncoder, while the numerical values are imputed with the k-Nearest Neighbors Imputer and normalized to a value between 0 and 1.
# Preprocessing pipelines.
mms = MinMaxScaler()
# Categorical: fill gaps with the mode, then encode each label as an integer.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OrdinalEncoder(),
)
# Numerical: impute via the 5 nearest neighbours, then scale into [0, 1].
num_pipeline = make_pipeline(
    KNNImputer(n_neighbors=5),
    mms,
)
# Fit/apply the pipelines and rebuild labelled dataframes from the arrays.
cat_col = cat_col_bkp  # restore the full categorical column list
comb_cat_arr = cat_pipeline.fit_transform(comb_cat_df)
comb_cat_df = pd.DataFrame(comb_cat_arr, columns=cat_col)
# Keep the unscaled numericals: the target scaler is refit from them later.
comb_num_df_bkp = comb_num_df
comb_num_arr = num_pipeline.fit_transform(comb_num_df)
comb_num_df = pd.DataFrame(comb_num_arr, columns=num_col)
# Zero-padding width used later when inverse-transforming predictions.
no_of_dummy_col = comb_num_df.shape[1] - 1
# Concatenate the processed numerical and categorical features side by side.
proc_comb_df = pd.concat([comb_num_df, comb_cat_df], axis=1)
# Snapshot so repeat experiments (e.g. dropping variables) can restart here.
temp_df = proc_comb_df
A Keras sequential model with three Dense layers is used. The two hidden layers use the ReLU (rectified linear unit) activation function, and the single-unit output layer also uses ReLU, which is compatible with the non-negative MinMax-scaled target. Mean absolute error (MAE) is used as both the training loss and the tracked metric, since this is a regression task predicting SalePrice.
def uncompiled_seq_model():
    """Build the (uncompiled) fully connected network: Dense 40 -> 10 -> 1.

    Every layer is ReLU-activated and gets its own random-normal kernel
    initializer seeded with the module-level seed for reproducibility.
    """
    def _dense(units):
        # A fresh seeded initializer per layer, matching the original setup.
        return tf.keras.layers.Dense(
            units,
            activation='relu',
            kernel_initializer=tf.keras.initializers.random_normal(seed=seed),
        )

    return tf.keras.models.Sequential([_dense(40), _dense(10), _dense(1)])
def seq_model(lr=0.001):
    """Return the network compiled with Adam(lr), MAE loss and MAE metric."""
    model = uncompiled_seq_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='mae', metrics=['mae'])
    return model
def model_predict(model, valid_data):
    """Predict with *model* on the features of the (x, y) pair *valid_data*.

    The labels in the pair are ignored; only the feature half is fed to
    model.predict, and its raw output is returned.
    """
    x_valid, _y_valid = valid_data
    return model.predict(x_valid)
Next, the combined dataframe is split back to train and valid datasets, respectively. The 'Group' label can be dropped as it does not contain information useful for training the model.
The train data is used for model training and the results checked against the validation dataset.
#Resplit the combined frame back into training and validation via 'Group'
proc_valid_df = proc_comb_df[proc_comb_df['Group']==1]
proc_train_df = proc_comb_df[proc_comb_df['Group']==0]
proc_train_y = proc_train_df['SalePrice']
# Bug fix: the original dropped 'SalePrice' and then immediately overwrote
# that result with a frame that dropped only 'Group', so the target leaked
# into the training features. Drop both columns in one step.
proc_train_x = proc_train_df.drop(columns=['SalePrice', 'Group'])
# Validation features: drop 'SalePrice' here too so the feature sets match
# (for validation rows that column is only KNN-imputed filler).
proc_valid_x = proc_valid_df.drop(columns=['SalePrice', 'Group'])
x_train = proc_train_x
y_train = proc_train_y
# Hold out 20% of the training rows as an internal test set.
x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.2)
#Backup for more training iterations
x_test_bkp = x_test
y_test_bkp = y_test
x_train_bkp = x_train
y_train_bkp = y_train
#Measuring the model against validation data
model = seq_model()
# Checkpoint the weights with the lowest validation MAE seen during training.
checkpoint_filepath = wd + '//checkpoint//model.ckpt'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=checkpoint_filepath,
save_weights_only=True,
monitor='val_mae',
mode='min',
save_best_only=True)
model.fit(x_train,y_train,epochs=200,validation_data=(x_test,y_test),verbose=2,callbacks=[model_checkpoint_callback])
test_data = (x_test,y_test)
# Restore the best checkpoint before evaluating.
model.load_weights(checkpoint_filepath)
yhat_test = model_predict(model,test_data)
# Predictions/targets are in MinMax-scaled space. Pad with zero columns so
# the arrays match the numeric feature width, with the target as the last
# column (assumes 'SalePrice' is ordered last — TODO confirm in num_col).
dummy_arr = np.zeros((np.shape(y_test)[0],no_of_dummy_col))
y_test = np.expand_dims(y_test,axis=1)
y_test = np.concatenate((dummy_arr,y_test),axis=1)
yhat_test = np.concatenate((dummy_arr,yhat_test),axis=1)
# Refit a scaler on the raw (unscaled) SalePrice column so values can be
# mapped back to original units before scoring.
scaler = MinMaxScaler().fit(comb_num_df_bkp['SalePrice'].to_numpy().reshape(-1,1))
y_test = scaler.inverse_transform(y_test)
yhat_test = scaler.inverse_transform(yhat_test)
# Report the test MAE in original SalePrice units.
print(mean_absolute_error(yhat_test[:,-1],y_test[:,-1]))
#print("Validation accuracy:%.2f" %valid_accuracy)
# Ablation study: briefly retrain with each variable removed in turn and
# record the resulting test MAE.
column_list = num_col + cat_col
column_list.remove('SalePrice')
column_list.remove('Group')
var_list = []
accuracy_list = []
for col in column_list:
    # Fresh model and a fresh copy of the saved split for every run.
    model = seq_model()
    y_train = y_train_bkp
    y_test = y_test_bkp
    x_train = x_train_bkp.drop(columns=[col])
    x_test = x_test_bkp.drop(columns=[col])
    print("\nEvaluation of model without variable %s" %col)
    model.fit(x_train,y_train,epochs=10,validation_data=(x_test,y_test),verbose=0)
    test_data = (x_test, y_test)
    yhat_test = model_predict(model, test_data)
    test_mae = float(mean_absolute_error(yhat_test, y_test))
    var_list.append(col)
    accuracy_list.append(test_mae)
    print("MAE: %.4f" %test_mae)
# Collect the results, worst (highest MAE when dropped) first.
var_df = pd.DataFrame(var_list, columns=['var'])
accuracy_df = pd.DataFrame(accuracy_list, columns=['accuracy'])
accuracy_df = pd.concat([var_df, accuracy_df], axis=1)
accuracy_df = accuracy_df.sort_values('accuracy', ascending=False)
# Variables whose removal leaves the MAE below the threshold are deemed
# dispensable and excluded from the final model.
var_to_exclude = list(accuracy_df[accuracy_df['accuracy'] < 0.033]['var'])
# Learning-rate range test: grow the LR exponentially each epoch and watch
# where the loss curve bottoms out.
x_train = x_train_bkp
y_train = y_train_bkp
x_test = x_test_bkp
y_test = y_test_bkp
lr_epochs = 120
model = seq_model()
# LR grows 10x every 20 epochs: 1e-6 up to ~1e0 over 120 epochs.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-6 * 10**(epoch / 20))
history = model.fit(x_train,y_train,epochs=lr_epochs,validation_data=(x_test,y_test),verbose=2,callbacks=[lr_schedule])
lr_history = history.history["lr"]
loss_history = history.history["loss"]
fig_lr = go.Figure()
# go.Line is a deprecated alias; go.Scatter with mode='lines' is the
# supported way to draw a line trace.
fig_lr.add_trace(go.Scatter(
x = lr_history,
y = loss_history,
mode = 'lines',
legendgroup = 'plot',
legendgrouptitle_text = 'plot',
name = 'Loss',
marker = dict(color='Red'),
))
fig_lr.layout.update(xaxis_range=[0,5e-3],title_text='Figure 4.3: Loss vs. Learning Rate',xaxis_title='Learning rate',yaxis_title='Loss')
fig_lr.show()
The above graph (Figure 4.3) shows the loss plotted against the learning rate. The optimal learning rate is immediately right of the elbow of the graph, which is observed to be approximately 0.001. That learning rate is used in the final model.
Now it is known that dropping certain variables makes the model prediction more accurate. Thus the variables whose removal leaves the test MAE below a chosen threshold are selected to be dropped from the final model. Conversely, the variables whose removal raises the error the most can be considered vital to the accuracy of the model and must be included during training.
Further, an earlier classification version of this analysis used RandomOverSampler to offset classLabel imbalance; for the present regression task the resampling step is left commented out, since SMOGN already rebalanced the target distribution.
# Final model at the learning rate chosen from the sweep in Figure 4.3.
model = seq_model(lr=0.001)
#var_to_exclude = list(accuracy_df[:3]['var'])
x_train = x_train_bkp
y_train = y_train_bkp
x_test = x_test_bkp
y_test = y_test_bkp
#print(x_train.columns)
# Drop the variables the ablation study found dispensable.
x_train = x_train.drop(columns=var_to_exclude)
x_test = x_test.drop(columns=var_to_exclude)
# Optional resampling (disabled): only relevant to a classification target.
#ros = RandomOverSampler()
#x_train, y_train = ros.fit_resample(x_train,y_train)
#Save the best model to avoid overfitting
checkpoint_filepath = wd + '//ckpt//model.ckpt'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
filepath=checkpoint_filepath,
save_weights_only=True,
monitor='val_mae',
mode='min',
save_best_only=True)
#print("\nEvaluation of model without variables %s, %s and %s" %(var_to_exclude[0],var_to_exclude[1],var_to_exclude[2]))
history = model.fit(x_train,y_train,epochs=200,validation_data=(x_test,y_test),callbacks=[model_checkpoint_callback],verbose=2)
# Restore the best checkpoint before evaluating.
model.load_weights(checkpoint_filepath)
test_data = (x_test,y_test)
yhat_test = model_predict(model,test_data)
#dummy_arr = np.zeros((np.shape(y_test)[0],no_of_dummy_col))
y_test = np.expand_dims(y_test,axis=1)
#y_test = np.concatenate((dummy_arr,y_test),axis=1)
#yhat_test = np.concatenate((dummy_arr,yhat_test),axis=1)
# Refit a single-column scaler on the raw SalePrice values so the scaled
# predictions/targets can be mapped back to original units.
scaler = MinMaxScaler().fit(comb_num_df_bkp['SalePrice'].to_numpy().reshape(-1,1))
y_test = scaler.inverse_transform(y_test)
yhat_test = scaler.inverse_transform(yhat_test)
# Report final test MAE in original SalePrice units.
print(mean_absolute_error(yhat_test[:,-1],y_test[:,-1]))
# Training-history curves. NOTE: the 'acc'/'Accuracy' names are historical —
# the compiled metric is MAE, so these curves actually plot MAE per epoch.
acc = history.history['mae']
val_acc = history.history['val_mae']
loss = history.history['loss']
val_loss = history.history['val_loss']
epoch = list(range(len(acc)))
fig_acc = go.Figure()
# go.Line is a deprecated alias; go.Scatter with mode='lines' is the
# supported way to draw a line trace.
fig_acc.add_trace(go.Scatter(
x = epoch,
y = acc,
mode = 'lines',
legendgroup = 'Plot',
legendgrouptitle_text = 'Plot',
name = 'Accuracy',
marker = dict(color='red'),
))
fig_acc.add_trace(go.Scatter(
x = epoch,
y = val_acc,
mode = 'lines',
legendgroup = 'Plot',
legendgrouptitle_text = 'Plot',
name = 'Validation Accuracy',
marker = dict(color='orange'),
))
fig_acc.layout.update(title_text='Accuracy and Validation Accuracy Plot',xaxis_title='Epoch',yaxis_title='Accuracy')
fig_acc.show()
# Training vs validation loss per epoch.
fig_loss = go.Figure()
# go.Line is a deprecated alias; go.Scatter with mode='lines' is the
# supported way to draw a line trace.
fig_loss.add_trace(go.Scatter(
x = epoch,
y = loss,
mode = 'lines',
legendgroup = 'Plot',
legendgrouptitle_text = 'Plot',
name = 'Loss',
marker = dict(color='red'),
))
fig_loss.add_trace(go.Scatter(
x = epoch,
y = val_loss,
mode = 'lines',
legendgroup = 'Plot',
legendgrouptitle_text = 'Plot',
name = 'Validation Loss',
marker = dict(color='orange'),
))
fig_loss.layout.update(title_text='Loss and Validation Loss Plot',xaxis_title='Epoch',yaxis_title='Loss')
fig_loss.show()